# Load the OSHA severe-injury data. EventDate is parsed as a date, ID is
# kept as text; Address2 and Inspection are not needed, so skip them.
severeinjury <- read_csv(
  "~/Downloads/severeinjury.csv",
  col_types = cols(
    Address2 = col_skip(),
    EventDate = col_date(format = "%m/%d/%Y"),
    ID = col_character(),
    Inspection = col_skip()
  )
)
# Give the two awkwardly named columns shorter, code-friendly names.
names(severeinjury)[names(severeinjury) == "Primary NAICS"] <- "naics"
names(severeinjury)[names(severeinjury) == "Part of Body Title"] <- "bodyPart"
# Pull the month and year out of the event date for later grouping.
severeinjury$month <- month(severeinjury$EventDate)
severeinjury$year <- year(severeinjury$EventDate)
# Count the missing values in every column. The original used
# summarise_all(funs(...)); funs() is deprecated (dplyr >= 0.8), and
# across() is the supported replacement with identical output here.
Nulls <- severeinjury %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# melt() turns the one-row summary into a long variable/value table.
Nulls <- melt(Nulls)
## No id variables; using all as measure variables
kable(Nulls)
| variable | value |
|---|---|
| ID | 0 |
| UPA | 0 |
| EventDate | 0 |
| Employer | 0 |
| Address1 | 11 |
| City | 11 |
| State | 0 |
| Zip | 13 |
| Latitude | 4 |
| Longitude | 4 |
| naics | 3 |
| Hospitalized | 0 |
| Amputation | 2 |
| Final Narrative | 0 |
| Nature | 0 |
| NatureTitle | 0 |
| Part of Body | 0 |
| bodyPart | 0 |
| Event | 0 |
| EventTitle | 0 |
| Source | 0 |
| SourceTitle | 0 |
| Secondary Source | 23509 |
| Secondary Source Title | 23509 |
| month | 0 |
| year | 0 |
Some columns are almost entirely null. For now I am keeping everything, because I do not yet know what we might want to do with these fields in the future. It makes sense for Secondary Source to be mostly null, because many incidents simply do not have a secondary source.
From this we also see that three records have no NAICS code. Since we are going to split the records into two files — the service sector and the non-service sector — we need to remove those records.
# Remove the records with a missing NAICS code; the sector split
# below depends on that field.
severeinjury <- drop_na(severeinjury, naics)
Before splitting the data, I want to check whether every remaining record has a full 6-digit NAICS code, so that any truncated codes can be set aside.
# Record how many characters each NAICS code has, so truncated codes
# can be spotted before classifying by code range.
severeWithdigits <- mutate(severeinjury, NumberOfNaics = nchar(naics))
max(severeWithdigits$NumberOfNaics)
## [1] 6
min(severeWithdigits$NumberOfNaics)
## [1] 2
The max number of digits is 6 which is good, but the minimum is 2. Let’s check how many records have less than 6.
# How many records fall short of the full 6-digit code?
sum(severeWithdigits$NumberOfNaics < 6)
## [1] 131
There are 131 records whose NAICS codes would either need to be handled separately or thrown out; for now I am going to remove them by moving them into a separate data set.
# Set aside the truncated codes; only full 6-digit codes can be
# classified into sectors reliably.
NotRealNaisc <- filter(severeWithdigits, NumberOfNaics < 6)
Injury <- filter(severeWithdigits, NumberOfNaics == 6)
# NAICS 22xxxx (utilities) is grouped with the service sector here,
# along with everything from 40xxxx upward.
Utilities <- Injury %>%
  filter(naics >= 220000, naics < 230000)
Service <- Injury %>%
  filter(naics >= 400000)
Service <- rbind(Service, Utilities)
# The remaining ranges (below 22xxxx and 23xxxx-39xxxx) make up the
# non-service file.
NonServiceTop <- Injury %>%
  filter(naics >= 230000, naics < 400000)
NonServiceBottom <- Injury %>%
  filter(naics < 220000)
NonService <- rbind(NonServiceTop, NonServiceBottom)
# Share of records involving a hospitalization in each sector
# (Hospitalized has no NAs, so mean() equals sum()/nrow()).
mean(NonService$Hospitalized)
## [1] 0.7574279
mean(Service$Hospitalized)
## [1] 0.8735039
About 76% of non-service records and 87% of service records involved a hospitalization, so severe injuries reported in the service sector are somewhat more likely to result in a hospitalization.
# Most common injury natures in the service sector (count > 190 keeps
# roughly the top ten categories).
# BUG FIX: the sector label was "NonService", but these are Service
# rows — the parallel NonService block below already uses "NonService".
Top10Service <- Service %>%
  group_by(NatureTitle) %>%
  summarize(countSer = n()) %>%
  filter(countSer > 190) %>%
  mutate(sector = "Service")
ggplot(Top10Service, aes(reorder(NatureTitle, countSer), countSer)) +
  geom_bar(stat = "identity") +
  coord_flip()
# Most common injury natures in the non-service sector (count > 250
# keeps roughly the top ten categories).
Top10NonService <- NonService %>%
  group_by(NatureTitle) %>%
  summarize(countNon = n()) %>%
  filter(countNon > 250) %>%
  mutate(sector = "NonService")
ggplot(Top10NonService, aes(reorder(NatureTitle, countNon), countNon)) +
  geom_bar(stat = "identity") +
  coord_flip()
The top 4 of the categories remain the same for both sectors. It is after that that we see the differences.
# Most frequently injured body parts in the service sector.
Top10BodyPartService <- Service %>%
  group_by(bodyPart) %>%
  summarize(count = n()) %>%
  filter(count > 500)
ggplot(Top10BodyPartService, aes(reorder(bodyPart, count), count)) +
  geom_bar(stat = "identity") +
  coord_flip()
# Most frequently injured body parts in the non-service sector.
Top10BodyPartNonService <- NonService %>%
  group_by(bodyPart) %>%
  summarize(count = n()) %>%
  filter(count > 500)
ggplot(Top10BodyPartNonService, aes(reorder(bodyPart, count), count)) +
  geom_bar(stat = "identity") +
  coord_flip()
Let's look at the monthly counts over the last few years to see whether any months have unusually high numbers of reports.
# Monthly injury counts per sector; FullDate is a "month/year" label.
ServiceDate <- Service %>%
  group_by(month, year) %>%
  summarize(numInjuries = n()) %>%
  mutate(FullDate = paste(month, year, sep = "/"))
NonServiceDate <- NonService %>%
  group_by(month, year) %>%
  summarize(numInjuries = n()) %>%
  mutate(FullDate = paste(month, year, sep = "/"))
# Overlay the two sectors (service = blue, non-service = red),
# one facet panel per year, then make it interactive.
p <- ggplot() +
  geom_line(
    data = ServiceDate,
    aes(group = 1, x = month, y = numInjuries),
    color = "blue"
  ) +
  geom_line(
    data = NonServiceDate,
    aes(group = 1, x = month, y = numInjuries),
    color = "red"
  ) +
  facet_wrap(~year, ncol = 4)
ggplotly(p)
Something interesting here is that the two sectors follow a very similar pattern. That is not what I expected — I thought they would be more independent, but the trends track each other closely. That makes me think some outside force drives more reports in certain months. We see spikes in September and October.
# Injury natures for service-sector records in August 2016.
Top10ServiceAug2016 <- Service %>%
  filter(month == 8, year == 2016) %>%
  group_by(NatureTitle) %>%
  summarize(countSer = n()) %>%
  filter(countSer > 10)
ggplot(Top10ServiceAug2016, aes(reorder(NatureTitle, countSer), countSer)) +
  geom_bar(stat = "identity") +
  coord_flip()
# Injury natures for non-service-sector records in August 2016.
Top10NonServiceAug2016 <- NonService %>%
  filter(month == 8, year == 2016) %>%
  group_by(NatureTitle) %>%
  summarize(count = n()) %>%
  filter(count > 15)
ggplot(Top10NonServiceAug2016, aes(reorder(NatureTitle, count), count)) +
  geom_bar(stat = "identity") +
  coord_flip()
We see a lot more heat and light injuries — I wonder if the temperature was extremely high in August of that year?
# What share of all records landed in August 2016?
InjuryAug <- filter(Injury, month == 8, year == 2016)
nrow(InjuryAug) / nrow(Injury)
## [1] 0.03325728
Do we have an even distribution of records across states?
# Records per state, computed separately for each sector.
ServiceStates <- Service %>%
  group_by(State) %>%
  summarise(ServiceRecords = n())
NonServiceStates <- NonService %>%
  group_by(State) %>%
  summarise(NonServiceRecords = n())
# Side-by-side counts; states absent from one sector come through as NA.
StatesRecords <- left_join(ServiceStates, NonServiceStates)
## Joining, by = "State"
#View(StatesRecords)
# Rank each state by record count (1 = most records). rank() on the
# negated counts with ties.method = "first" reproduces the original
# order()-based assignment exactly, ties broken by row position.
ServiceStates$ServiceRank <-
  rank(-ServiceStates$ServiceRecords, ties.method = "first")
NonServiceStates$NonServiceRank <-
  rank(-NonServiceStates$NonServiceRecords, ties.method = "first")
StateRecords <- left_join(ServiceStates, NonServiceStates)
## Joining, by = "State"
The top-ranked states for service and non-service differ somewhat, which is mildly interesting.
A rank of 1 means that state had the most records.
# Keep only the rank columns for the state comparison table.
StateRecords <- select(StateRecords, State, ServiceRank, NonServiceRank)
kable(StateRecords)
| State | ServiceRank | NonServiceRank |
|---|---|---|
| ALABAMA | 14 | 9 |
| ALASKA | 49 | 43 |
| AMERICAN SAMOA | 54 | 44 |
| ARIZONA | 35 | 37 |
| ARKANSAS | 15 | 12 |
| CALIFORNIA | 21 | 32 |
| COLORADO | 9 | 10 |
| CONNECTICUT | 20 | 23 |
| DELAWARE | 32 | 26 |
| DISTRICT OF COLUMBIA | 29 | 31 |
| FLORIDA | 2 | 2 |
| GEORGIA | 7 | 6 |
| GUAM | 50 | 42 |
| HAWAII | 44 | 41 |
| IDAHO | 24 | 21 |
| ILLINOIS | 5 | 5 |
| INDIANA | 45 | NA |
| IOWA | 47 | NA |
| KANSAS | 16 | 16 |
| KENTUCKY | 46 | 34 |
| LOUISIANA | 13 | 13 |
| MAINE | 22 | 25 |
| MARYLAND | 36 | 35 |
| MASSACHUSETTS | 11 | 17 |
| MICHIGAN | 37 | 51 |
| MINNESOTA | 43 | 52 |
| MISSISSIPPI | 19 | 15 |
| MISSOURI | 10 | 11 |
| MONTANA | 27 | 28 |
| NEBRASKA | 18 | 19 |
| NEVADA | 51 | NA |
| NEW HAMPSHIRE | 26 | 27 |
| NEW JERSEY | 8 | 18 |
| NEW MEXICO | 39 | 46 |
| NEW YORK | 6 | 8 |
| NORTH CAROLINA | 38 | 36 |
| NORTH DAKOTA | 25 | 20 |
| NORTHERN MARIANA ISLANDS | 53 | 48 |
| OHIO | 4 | 3 |
| OKLAHOMA | 17 | 14 |
| OREGON | 40 | 38 |
| PENNSYLVANIA | 3 | 4 |
| PUERTO RICO | 55 | 49 |
| RHODE ISLAND | 31 | 30 |
| SOUTH CAROLINA | 42 | 39 |
| SOUTH DAKOTA | 28 | 24 |
| TENNESSEE | 34 | 40 |
| TEXAS | 1 | 1 |
| UTAH | 41 | 50 |
| VERMONT | 52 | NA |
| VIRGINIA | 30 | 29 |
| WASHINGTON | 33 | 33 |
| WEST VIRGINIA | 23 | 22 |
| WISCONSIN | 12 | 7 |
| WYOMING | 48 | 45 |